import warnings
import lux
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
import plotly
import datetime as dt
import statsmodels.api as sm
import os.path
import os
import sys
import gc
import lightgbm as lgb
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from datetime import datetime
from dateutil import relativedelta
def read_exc(file, sheet, base_dir=r'C:\Users\mat.prokopenko\Desktop\intro\eda'):
    """Load one worksheet of an Excel workbook into a DataFrame.

    Parameters
    ----------
    file : str
        Workbook file name; it is joined onto ``base_dir``.
    sheet : str or int
        Worksheet selector forwarded to ``pandas.read_excel`` as
        ``sheet_name``.
    base_dir : str, optional
        Directory that contains the workbook. The default is the location
        that used to be hard-coded, so existing two-argument calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The sheet contents, with the first row used as the header.
    """
    path = os.path.join(base_dir, file)
    return pd.read_excel(path, sheet_name=sheet, header=0)
def add_ts_params(df):
    """Add ``Date`` and ``Month`` columns parsed from 'Full Date Alternate Key'.

    The frame is modified in place and also returned, so the call can be
    used either as a statement or in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Full Date Alternate Key' column; its values are
        parsed day-first (e.g. ``01/02/2020`` is 1 February 2020).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the two extra columns.
    """
    df['Date'] = pd.to_datetime(df['Full Date Alternate Key'], dayfirst=True)
    df['Month'] = df['Date'].dt.month
    return df
# Load the survey answers.
df = read_exc('test3_0.xlsx', 'answ')

# Multi-select "work / study" question -> two 0/1 indicator lists, one entry
# per respondent. Missing answers are treated as an empty selection so the
# substring test cannot raise on a non-string value and both lists always
# have exactly len(df) entries.
occupation = df['Робота-навчання (можна обрати декілька)'].fillna('').astype(str)
work = [
    int('Працюю повний робочий день' in answer or 'Працюю парт-тайм' in answer)
    for answer in occupation
]
study = [int('Навчаюсь в університеті' in answer) for answer in occupation]
# Multi-select "where do you usually buy groceries" question -> one 0/1
# indicator list per store chain, one entry per respondent. Missing answers
# count as "no store mentioned" instead of raising on the substring test.
stores = df['Де ви найчастіше купуєте продукти? (не більше 3 варіантів)'].fillna('').astype(str)
silpo = [int('Сільпо' in answer) for answer in stores]
atb = [int('Атб' in answer) for answer in stores]
varus = [int('Varus' in answer) for answer in stores]
novus = [int('Novus' in answer) for answer in stores]
auchan = [int('Ашан' in answer) for answer in stores]
# Encode the financial-support answer as 1 / 2 / 3.
# Series.map keeps one entry per respondent: an answer that is missing or
# not listed here becomes NaN instead of being skipped, so `fins` always has
# len(df) entries and stays aligned with the rows it is later attached to.
# (The column label really does end with a space.)
finhelp_codes = {
    'Мене повністю забезпечує хтось інший': 1,
    'Я повністю себе забезпечую': 2,
    'Частково я забезпечую себе сам (з-п та/або стипендія), частково - допомагає хтось інший': 3,
}
fins = df['Фінансова допомога '].map(finhelp_codes).tolist()
# Assemble the modelling frame: the ten CHOICE columns plus the demographic
# answers and the indicator lists derived from the multi-select questions.
sel_col = df[[f'CHOICE{n}' for n in range(1, 11)]]
# Work on a copy so adding columns never writes back into `df`.
dffin = sel_col.copy()
dffin['sex'] = df['Оберіть свою стать']
dffin['os'] = df['Якою ОС на смартфоні ви користуєтесь?']
dffin['is_working'] = work
dffin['is_studying'] = study
dffin['silpo'] = silpo
dffin['atb'] = atb
dffin['varus'] = varus
dffin['novus'] = novus
dffin['auchan'] = auchan
dffin['finhelp'] = fins
# Drop respondents who preferred not to state their sex. The explicit copy
# makes the column assignments below operate on an independent frame rather
# than on a filtered view (avoids SettingWithCopyWarning).
dffin = dffin[dffin['sex'] != 'Бажаю не говорити'].copy()

# Numeric encoding: sex 1 = man, 2 = woman; os 1 = Android, 2 = IOS.
# Series.map yields exactly one value per remaining row (NaN for an
# unexpected or missing answer), so the lists can never end up shorter than
# the frame they are assigned to.
sexs = dffin['sex'].map({'Чоловік': 1, 'Жінка': 2}).tolist()
oss = dffin['os'].map({'Android': 1, 'IOS': 2}).tolist()
dffin['sex'] = sexs
dffin['os'] = oss
# Columns that may hold the "can't decide" answer and must become integers.
coltoch = ['CHOICE1', 'CHOICE2', 'CHOICE3', 'CHOICE4']

# Keep only rows where none of those columns is "can't decide", then cast
# the columns to int64. The copy makes dffin1 an independent frame.
undecided = dffin[coltoch].eq('Не можу визначитись').any(axis=1)
dffin1 = dffin.loc[~undecided].copy()
dffin1 = dffin1.astype({col: 'int64' for col in coltoch})
dffin1.info()
# Train/test split + MinMax scaling
X = dffin1
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
from sklearn.preprocessing import MinMaxScaler

# Hold-out scaling: learn min/max on the training rows only and reuse them
# for the test rows, so both splits share one scale and nothing about the
# test rows leaks into the fit.
split_scaler = MinMaxScaler().fit(X_train)
Xscaltr = split_scaler.transform(X_train)
Xscalts = split_scaler.transform(X_test)

# Scaling of the complete data set (used for the clustering below).
# `scal` is left fitted on the full X.
scal = MinMaxScaler()
Xscal = scal.fit_transform(X)
# PCA
from sklearn.decomposition import PCA

# Train/test projection: fit the components on the training rows and apply
# the same components to the test rows, so both live in one coordinate
# system (a separate fit on the test rows would give unrelated axes).
split_pca = PCA(n_components=2).fit(Xscaltr)
Xscaltrpca = split_pca.transform(Xscaltr)
Xscaltspca = split_pca.transform(Xscalts)

# 2-D projection of the complete scaled data set. `pca1` is left fitted on
# Xscal.
pca1 = PCA(n_components=2)
Xscalpca = pca1.fit_transform(Xscal)
# K-means clustering
from sklearn.cluster import KMeans

# Fit a 3-cluster k-means on the 2-D PCA projection and label every row.
kmeans = KMeans(n_clusters=3).fit(Xscalpca)
clust = kmeans.predict(Xscalpca)

# Scatter of the projected rows coloured by cluster, with the centroids
# drawn as diamonds on top.
x1 = Xscalpca[:, 0]
y1 = Xscalpca[:, 1]
plt.scatter(x1, y1, c=clust)
centroids1 = kmeans.cluster_centers_
centrX1 = centroids1[:, 0]
centrY1 = centroids1[:, 1]
plt.scatter(centrX1, centrY1, marker='D', s=50)
# plt.show must be called; the bare attribute reference displayed nothing.
plt.show()

# Calculate silhouette_score
from sklearn.metrics import silhouette_samples, silhouette_score
print(silhouette_score(Xscalpca, clust))
# Silhouette analysis of k-means on the 2-D PCA projection for k = 2, 3, 4.
# For every k one figure is produced: the per-sample silhouette bars on the
# left and the clustered scatter with centroids on the right.
for k in [2, 3, 4]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Run the k-means algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(Xscalpca)
    centroids = km.cluster_centers_

    # Per-sample silhouette coefficients
    silhouette_vals = silhouette_samples(Xscalpca, labels)

    # Silhouette plot: the sorted coefficients of each cluster are stacked
    # as one band of horizontal bars; y_lower/y_upper track the band limits.
    y_lower, y_upper = 0, 0
    for band, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(band + 1))
        y_lower += len(cluster_silhouette_vals)

    # Average silhouette score as a dashed vertical reference line
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02)

    # Scatter plot of the data coloured by cluster label
    ax2.scatter(Xscalpca[:, 0], Xscalpca[:, 1], c=labels)
    ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
    ax2.set_xlim([-2, 2])
    # The second limit call used to repeat set_xlim; it is the y range.
    ax2.set_ylim([-2, 2])
    # The plotted coordinates are the two PCA components.
    ax2.set_xlabel('PCA component 1')
    ax2.set_ylabel('PCA component 2')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}',
                 fontsize=16, fontweight='semibold', y=1.05)
# Assign the held-out rows to the k-means clusters.
# `kmeans` was fitted on Xscalpca, i.e. on X scaled by `scal` and projected
# by `pca1`. The test rows are therefore pushed through those same fitted
# objects; coordinates from an independently fitted PCA are not comparable
# with the centroids. This relies on `scal` / `pca1` still being the
# objects that produced Xscal / Xscalpca at this point of the script.
Xscaltspca = pca1.transform(scal.transform(X_test))
predictions = kmeans.predict(Xscaltspca)

# Scatter of the test rows coloured by predicted cluster, plus centroids.
xs = Xscaltspca[:, 0]
ys = Xscaltspca[:, 1]
plt.scatter(xs, ys, c=predictions)
centroids = kmeans.cluster_centers_
centrX = centroids[:, 0]
centrY = centroids[:, 1]
plt.scatter(centrX, centrY, marker='D', s=50)
# plt.show must be called; the bare attribute reference displayed nothing.
plt.show()
# Elbow curve: k-means inertia on the PCA projection for k = 1..9.
ks = range(1, 10)
inertias = []
for k in ks:
    # Build and fit a k-cluster model in one step, then record its inertia.
    model = KMeans(n_clusters=k).fit(Xscalpca)
    inertias.append(model.inertia_)

# Inertia against the number of clusters
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
# Attach the k-means label to a copy of the feature frame and export both
# the labelled rows and the per-cluster medians.
X_cl = X.assign(cluster=clust)
X_cl.columns
wsex_cluster_medians = X_cl.groupby('cluster').median()
X_cl.to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\wsex.xlsx')
wsex_cluster_medians.to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\wsexclust.xlsx')
# Spectral clustering with the sex parameter
from sklearn.cluster import SpectralClustering

# Spectral clustering (3 clusters, RBF affinity) on the PCA projection.
sp = SpectralClustering(n_clusters=3, affinity='rbf')
sp.fit(Xscalpca)
clustsp = sp.labels_

# Scatter of the projected rows coloured by spectral cluster.
x1 = Xscalpca[:, 0]
y1 = Xscalpca[:, 1]
plt.scatter(x1, y1, c=clustsp)
# plt.show must be called; the bare attribute reference displayed nothing.
plt.show()
print(silhouette_score(Xscalpca, clustsp))

# Export the labelled rows and the per-cluster medians.
X_clsp = X.copy()
X_clsp['cluster'] = clustsp
X_clsp.to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\wsexsp.xlsx')
X_clsp.groupby('cluster').median().to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\wsexclustsp.xlsx')
# Let's drop the 'sex' feature
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Same features as before, without the 'sex' column.
Xnosextr = X_train.drop('sex', axis=1)
Xnosexts = X_test.drop('sex', axis=1)
Xnosex = X.drop('sex', axis=1)

# Train/test: fit scaler and PCA on the training rows only and apply the
# fitted transforms to the test rows, so both splits share one scale and
# one set of axes.
split_scaler_nosex = MinMaxScaler().fit(Xnosextr)
Xscaltrnosex = split_scaler_nosex.transform(Xnosextr)
Xscaltsnosex = split_scaler_nosex.transform(Xnosexts)
split_pca_nosex = PCA(n_components=2).fit(Xscaltrnosex)
Xscaltrpcanosex = split_pca_nosex.transform(Xscaltrnosex)
Xscaltspcanosex = split_pca_nosex.transform(Xscaltsnosex)

# Complete data set (used for the clustering below). `scal` is left fitted
# on Xnosex and `pca2` on Xscalnosex.
scal = MinMaxScaler()
Xscalnosex = scal.fit_transform(Xnosex)
pca2 = PCA(n_components=2)
Xscalpcanosex = pca2.fit_transform(Xscalnosex)
from sklearn.cluster import KMeans

# Fit a 3-cluster k-means on the no-sex PCA projection.
kmeans2 = KMeans(n_clusters=3).fit(Xscalpcanosex)
# Label the rows with the model fitted just above. Predicting with the
# earlier `kmeans` (fitted on the projection that included 'sex') would
# assign clusters that do not match the kmeans2 centroids plotted below.
clust2 = kmeans2.predict(Xscalpcanosex)

# Scatter of the projected rows coloured by cluster, plus centroids.
x2 = Xscalpcanosex[:, 0]
y2 = Xscalpcanosex[:, 1]
plt.scatter(x2, y2, c=clust2)
centroids2 = kmeans2.cluster_centers_
centrX2 = centroids2[:, 0]
centrY2 = centroids2[:, 1]
plt.scatter(centrX2, centrY2, marker='D', s=50)
# plt.show must be called; the bare attribute reference displayed nothing.
plt.show()

# Calculate silhouette_score
from sklearn.metrics import silhouette_samples, silhouette_score
print(silhouette_score(Xscalpcanosex, clust2))
# Silhouette analysis of k-means on the no-sex PCA projection for
# k = 2, 3, 4. For every k one figure is produced: the per-sample
# silhouette bars on the left and the clustered scatter on the right.
for k in [2, 3, 4]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Run the k-means algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(Xscalpcanosex)
    centroids = km.cluster_centers_

    # Per-sample silhouette coefficients
    silhouette_vals = silhouette_samples(Xscalpcanosex, labels)

    # Silhouette plot: the sorted coefficients of each cluster are stacked
    # as one band of horizontal bars; y_lower/y_upper track the band limits.
    y_lower, y_upper = 0, 0
    for band, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(band + 1))
        y_lower += len(cluster_silhouette_vals)

    # Average silhouette score as a dashed vertical reference line
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02)

    # Scatter plot of the data coloured by cluster label
    ax2.scatter(Xscalpcanosex[:, 0], Xscalpcanosex[:, 1], c=labels)
    ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
    ax2.set_xlim([-2, 2])
    # The second limit call used to repeat set_xlim; it is the y range.
    ax2.set_ylim([-2, 2])
    # The plotted coordinates are the two PCA components.
    ax2.set_xlabel('PCA component 1')
    ax2.set_ylabel('PCA component 2')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}',
                 fontsize=16, fontweight='semibold', y=1.05)
# Assign the held-out rows (without 'sex') to the kmeans2 clusters.
# `kmeans2` was fitted on Xscalpcanosex, i.e. on Xnosex scaled by `scal`
# and projected by `pca2`. The test rows are therefore pushed through those
# same fitted objects; coordinates from an independently fitted PCA are not
# comparable with the centroids. This relies on `scal` / `pca2` still being
# the objects that produced Xscalnosex / Xscalpcanosex at this point.
Xscaltspcanosex = pca2.transform(scal.transform(Xnosexts))
predictions2 = kmeans2.predict(Xscaltspcanosex)

# Scatter of the test rows coloured by predicted cluster, plus centroids.
xs2 = Xscaltspcanosex[:, 0]
ys2 = Xscaltspcanosex[:, 1]
plt.scatter(xs2, ys2, c=predictions2)
centroids2 = kmeans2.cluster_centers_
centrX2 = centroids2[:, 0]
centrY2 = centroids2[:, 1]
plt.scatter(centrX2, centrY2, marker='D', s=50)
# plt.show must be called; the bare attribute reference displayed nothing.
plt.show()
# Elbow curve: k-means inertia on the no-sex PCA projection for k = 1..9.
ks2 = range(1, 10)
inertias2 = []
for k in ks2:
    # Build and fit a k-cluster model in one step, then record its inertia.
    model2 = KMeans(n_clusters=k).fit(Xscalpcanosex)
    inertias2.append(model2.inertia_)

# Inertia against the number of clusters
plt.plot(ks2, inertias2, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks2)
plt.show()
# Attach the cluster label to a copy of the no-sex feature frame and export
# both the labelled rows and the per-cluster medians.
X_cl2 = Xnosex.assign(cluster=clust2)
X_cl2.columns
nosex_cluster_medians = X_cl2.groupby('cluster').median()
X_cl2.to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\nosex.xlsx')
nosex_cluster_medians.to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\nosexclust.xlsx')
# Spectral clustering without the sex parameter
from sklearn.cluster import SpectralClustering

# Spectral clustering (4 clusters, RBF affinity) on the no-sex projection.
spnosex = SpectralClustering(n_clusters=4, affinity='rbf')
spnosex.fit(Xscalpcanosex)
clustsp1 = spnosex.labels_

# Scatter of the projected rows coloured by spectral cluster.
xsp = Xscalpcanosex[:, 0]
ysp = Xscalpcanosex[:, 1]
plt.scatter(xsp, ysp, c=clustsp1)
# plt.show must be called; the bare attribute reference displayed nothing.
plt.show()
print(silhouette_score(Xscalpcanosex, clustsp1))

# Export the labelled rows and the per-cluster medians.
X_clsp1 = Xnosex.copy()
X_clsp1['cluster'] = clustsp1
X_clsp1.to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\nosexsp.xlsx')
X_clsp1.groupby('cluster').median().to_excel(r'C:\Users\mat.prokopenko\Desktop\intro\eda\nosexclustsp.xlsx')
# Hierarchical clustering
# Imports for the agglomerative clustering and its plot
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Complete-linkage merge tree over the PCA projection
mergings = linkage(Xscalpca, method='complete')

# Dendrogram with each leaf labelled by its row position in Xscalpca
dendrogram(
    mergings,
    labels=range(len(Xscalpca)),
    leaf_rotation=90,
    leaf_font_size=10,
)
plt.show()
# t-SNE
# t-SNE embedding of the (unscaled) no-sex features
from sklearn.manifold import TSNE

modelmap = TSNE(learning_rate=100)
tsne_features = modelmap.fit_transform(Xnosex)

# First and second embedding coordinates
xm, ym = tsne_features[:, 0], tsne_features[:, 1]

# Scatter of the embedding, coloured by the clust2 labels
plt.scatter(xm, ym, c=clust2)
plt.show()
# Let us use Random Forest to try to classify whether a customer prefers Silpo ('silpo' = 1) or not
# Features: every column except the target; target: the 'silpo' indicator.
X = dffin1.drop('silpo', axis=1)
y = dffin1[['silpo']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scal = MinMaxScaler()
Xscaltr = scal.fit_transform(X_train)
# Reuse the min/max learned on the training rows. Refitting on the test
# rows would scale them differently from the data the forest was trained
# on and leak test-set statistics into the evaluation.
Xscalts = scal.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=0, bootstrap=True)
# ravel(): y_train is a one-column frame, fit expects a 1-D target.
clf.fit(Xscaltr, y_train.values.ravel())
predrf = clf.predict(Xscalts)

from sklearn.metrics import accuracy_score as acc
print("Accuracy:", acc(y_test, predrf))
from sklearn.metrics import balanced_accuracy_score as bacc
print("Balanced accuracy:", bacc(y_test, predrf))
from sklearn.metrics import f1_score as f1
# F1 is a score in its own right, not an accuracy.
print("F1 score:", f1(y_test, predrf))

# NOTE(review): hard-coded numbers; presumably the three metrics from an
# earlier run averaged by hand - confirm, or compute from the values above.
print((0.77 + 0.73 + 0.725)/3)
dffin1.columns
import plotly.express as px

# Encodings used in the charts below:
#   sex: 1 - man, 2 - woman
#   os: 1 - Android, 2 - IOS
#   is_working: 0 - not working, 1 - working

# Share of each sex
fig = px.pie(data_frame=dffin1, names='sex', title='Sex dist')
fig.show()

# OS preference, one row of panels per sex
fig = px.histogram(data_frame=dffin1, x='os', facet_row='sex', histnorm='percent')
fig.show()

# Working status per sex (rows), split by studying status (columns)
fig = px.histogram(
    data_frame=dffin1,
    x='is_working',
    facet_row='sex',
    facet_col='is_studying',
    histnorm='percent',
)
fig.show()

# Silpo preference per sex (rows), split by ATB preference (columns)
fig = px.histogram(
    data_frame=dffin1,
    x='silpo',
    facet_row='sex',
    facet_col='atb',
    histnorm='percent',
)
fig.show()